library(tidyverse)

MMETSP

Read raw data.

# Load the MMETSP sample attribute table (tab-separated; each row carries a
# sample id, a sample name, an attribute type and that attribute's value).
# Column types are pinned explicitly so parsing does not depend on readr's
# type guessing from the first rows of the file.
mmetsp_raw_data <- read_tsv(
  "../data/sample-attr.tab.txt",
  col_types = cols(
    sample_id = col_double(),
    sample_name = col_character(),
    attr_type = col_character(),
    attr_value = col_character()
  )
)

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────
cols(
  sample_id = col_double(),
  sample_name = col_character(),
  attr_type = col_character(),
  attr_value = col_character()
)
# Auto-print the raw attribute table so it can be inspected.
mmetsp_raw_data

Unpack attributes.

# Spread the long attribute table to one row per (sample_id, sample_name_main)
# pair with one column per `attr_type`. `sample_name` is renamed first so the
# id column keeps a distinct name from any attribute column.
# Some id/attribute combinations occur more than once, so the attribute
# columns are list-columns; `values_fn = list` requests that explicitly
# instead of relying on the implicit fallback, which emits a warning.
mmetsp_wider <- mmetsp_raw_data %>%
  rename(sample_name_main = sample_name) %>%
  pivot_wider(
    id_cols = c("sample_id", "sample_name_main"),
    names_from = "attr_type",
    values_from = "attr_value",
    values_fn = list,
    names_repair = "unique"
  )
Values are not uniquely identified; output will contain list-cols.
* Use `values_fn = list` to suppress this warning.
* Use `values_fn = length` to identify where the duplicates arise
* Use `values_fn = {summary_fun}` to summarise duplicates
# Auto-print the widened table (attribute columns are list-columns).
mmetsp_wider

Select and unnest taxon info.

# Keep the sample identifiers plus the taxonomy attributes, flatten the
# taxonomy list-columns back to ordinary columns, and build a single
# underscore-separated label from genus, species and strain.
mmetsp_taxon <- mmetsp_wider %>%
  select(
    sample_id, sample_name_main,
    taxon_id, phylum, class, order, genus, species, strain
  ) %>%
  # `cols` is spelled out: calling unnest() without it is deprecated.
  unnest(cols = c(taxon_id, phylum, class, order, genus, species, strain)) %>%
  mutate(
    # Spaces inside any of the three parts are also turned into underscores.
    genus_species_strain = gsub(
      " ", "_",
      paste(genus, species, strain, sep = "_")
    )
  )
`cols` is now required when using unnest().
Please use `cols = c(taxon_id, phylum, class, order, genus, species, strain)`
# Auto-print the flattened taxonomy table.
mmetsp_taxon

Select only barebones.

# Reduce the taxonomy table to the identifiers and the combined
# genus/species/strain label.
mmetsp_select <- select(
  mmetsp_taxon,
  sample_id, sample_name_main, taxon_id, genus_species_strain
)

Genbank

# Column names applied to the GenBank assembly summary, in file order.
# NOTE(review): "infraspecific_name2" and "isolateversion_status" do not look
# like NCBI's documented field names (its README appears to list `isolate` and
# `version_status` in those positions) -- confirm before renaming, because
# these names propagate into everything derived from `genbank`.
colNames <- "assembly_accession, bioproject, biosample, wgs_master, refseq_category, taxid, species_taxid, organism_name, infraspecific_name, infraspecific_name2, isolateversion_status, assembly_level, release_type, genome_rep, seq_rel_date, asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path, excluded_from_refseq, relation_to_type_material"
colNamesVec <- unlist(str_split(colNames, ", "))

genbank_path <- "../data/assembly_summary_genbank.txt"

# Count the leading "#" header lines so they can be skipped explicitly.
# `comment = "#"` is deliberately not used: readr discards everything after a
# comment marker anywhere in a line, so records whose free-text fields contain
# "#" were cut short (parsed with 9 or 10 fields instead of 22) and lost
# their `ftp_path`. Only the first 10 lines are inspected for header lines.
genbank_header <- read_lines(genbank_path, n_max = 10)
n_header_lines <- sum(cumprod(startsWith(genbank_header, "#")))

# `taxid` and `species_taxid` are read directly as character (via the
# default): parsing them as double and converting with as.character() can
# yield scientific notation (e.g. "1e+05"), which would never match the
# character `taxon_id` values they are joined against.
genbank <- read_tsv(
  genbank_path,
  skip = n_header_lines,
  col_names = colNamesVec,
  col_types = cols(
    .default = col_character(),
    seq_rel_date = col_date(format = "")
  )
)

── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  taxid = col_double(),
  species_taxid = col_double(),
  seq_rel_date = col_date(format = "")
)
ℹ Use `spec()` for the full column specifications.

46385 parsing failures.
row col   expected     actual                                   file
  3  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 12  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 26  -- 22 columns 10 columns '../data/assembly_summary_genbank.txt'
 95  -- 22 columns 9 columns  '../data/assembly_summary_genbank.txt'
 96  -- 22 columns 9 columns  '../data/assembly_summary_genbank.txt'
... ... .......... .......... ......................................
See problems(...) for more details.
# Trim the assembly summary to the taxonomy ids, organism name, genome
# representation and FTP location, then display the result.
genbank_select <- select(
  genbank,
  taxid, species_taxid, organism_name, genome_rep, ftp_path
)

genbank_select

Join

# Show the MMETSP side of the join.
mmetsp_taxon

# Make sure both GenBank id columns are character so they are comparable
# with the character `taxon_id` column.
genbank_select <- genbank_select %>%
  mutate(taxid = as.character(taxid)) %>%
  mutate(species_taxid = as.character(species_taxid))

genbank_select

# Suffix appended to the last component of `ftp_path` to form the genome
# file name.
suffix <- "_genomic.fna.gz"

# Match MMETSP samples to GenBank assemblies on taxon id, then keep the first
# matching row per sample and the first remaining row per taxon (which row is
# "first" depends on input order).
# The file name and full path are built with vectorised stringr calls, so no
# rowwise() grouping is left on the result, and a missing `ftp_path` gives NA
# rather than the literal strings "NA_genomic.fna.gz" / "NA/NA_genomic.fna.gz".
join_taxid <- mmetsp_taxon %>%
  inner_join(genbank, by = c("taxon_id" = "taxid")) %>%
  distinct(sample_id, .keep_all = TRUE) %>%
  distinct(taxon_id, .keep_all = TRUE) %>%
  mutate(
    # Everything after the final "/" of the FTP directory, plus the suffix.
    genome_filename = str_c(str_extract(ftp_path, "[^/]*$"), suffix),
    genome_ftp_path = str_c(ftp_path, genome_filename, sep = "/")
  )

join_taxid
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCiMgTU1FVFNQCgpSZWFkIHJhdyBkYXRhLgpgYGB7cn0KbW1ldHNwX3Jhd19kYXRhIDwtIHJlYWRfdHN2KCcuLi9kYXRhL3NhbXBsZS1hdHRyLnRhYi50eHQnKQptbWV0c3BfcmF3X2RhdGEKYGBgCgpVbnBhY2sgYXR0cmlidXRlcy4KYGBge3J9Cm1tZXRzcF93aWRlciA8LSBtbWV0c3BfcmF3X2RhdGEgJT4lCiAgcmVuYW1lKHNhbXBsZV9uYW1lX21haW4gPSBzYW1wbGVfbmFtZSkgJT4lCiAgcGl2b3Rfd2lkZXIoaWRfY29scyA9IGMoJ3NhbXBsZV9pZCcsICdzYW1wbGVfbmFtZV9tYWluJyksIG5hbWVzX2Zyb20gPSAiYXR0cl90eXBlIiwgdmFsdWVzX2Zyb20gPSAiYXR0cl92YWx1ZSIsIG5hbWVzX3JlcGFpciA9ICJ1bmlxdWUiKQoKbW1ldHNwX3dpZGVyCmBgYApTZWxlY3QgYW5kIHVubmVzdCB0YXhvbiBpbmZvLgpgYGB7cn0KbW1ldHNwX3RheG9uIDwtIG1tZXRzcF93aWRlciAlPiUKICBzZWxlY3Qoc2FtcGxlX2lkLCBzYW1wbGVfbmFtZV9tYWluLCB0YXhvbl9pZCwgcGh5bHVtLCBjbGFzcywgb3JkZXIsIGdlbnVzLCBzcGVjaWVzLCBzdHJhaW4pICU+JQogIHVubmVzdCgpICU+JQogIG11dGF0ZSgKICAgIGdlbnVzX3NwZWNpZXNfc3RyYWluID0gZ3N1YigiICIsICJfIiwgcGFzdGUoZ2VudXMsIHNwZWNpZXMsIHN0cmFpbiwgc2VwID0gIl8iKSkKICApCm1tZXRzcF90YXhvbgpgYGAKClNlbGVjdCBvbmx5IGJhcmVib25lcy4KYGBge3J9Cm1tZXRzcF9zZWxlY3QgPC0gbW1ldHNwX3RheG9uICU+JQogIHNlbGVjdCgnc2FtcGxlX2lkJywgJ3NhbXBsZV9uYW1lX21haW4nLCAndGF4b25faWQnLCAnZ2VudXNfc3BlY2llc19zdHJhaW4nKQpgYGAKCiMgR2VuYmFuawoKYGBge3J9CmNvbE5hbWVzIDwtICJhc3NlbWJseV9hY2Nlc3Npb24sIGJpb3Byb2plY3QsIGJpb3NhbXBsZSwgd2dzX21hc3RlciwgcmVmc2VxX2NhdGVnb3J5LCB0YXhpZCwgc3BlY2llc190YXhpZCwgb3JnYW5pc21fbmFtZSwgaW5mcmFzcGVjaWZpY19uYW1lLCBpbmZyYXNwZWNpZmljX25hbWUyLCBpc29sYXRldmVyc2lvbl9zdGF0dXMsIGFzc2VtYmx5X2xldmVsLCByZWxlYXNlX3R5cGUsIGdlbm9tZV9yZXAsIHNlcV9yZWxfZGF0ZSwgYXNtX25hbWUsIHN1Ym1pdHRlciwgZ2Jyc19wYWlyZWRfYXNtLCBwYWlyZWRfYXNtX2NvbXAsIGZ0cF9wYXRoLCBleGNsdWRlZF9mcm9tX3JlZnNlcSwgcmVsYXRpb25fdG9fdHlwZV9tYXRlcmlhbCIKY29sTmFtZXNWZWMgPC0gdW5saXN0KHN0cl9zcGxpdChjb2xOYW1lcywgIiwgIikpCmBgYAoKYGBge3J9CmdlbmJhbmsgPC0gcmVhZF90c3YoJy4uL2RhdGEvYXNzZW1ibHlfc3VtbWFyeV9nZW5iYW5rLnR4dCcsCiAgICAgICAgICAgICAgICAgICAgY29tbWVudCA9ICIjIiwKICAgICAgICAgICAgICAgICAgICBjb2xfbmFtZXMgPSBjb2xOYW1lc1ZlYykgJT4lCiAgbXV0YXRlKHRheGlkID0g
YXMuY2hhcmFjdGVyKHRheGlkKSwKICAgICAgICAgc3BlY2llc190YXhpZCA9IGFzLmNoYXJhY3RlcihzcGVjaWVzX3RheGlkKSkKYGBgCgpgYGB7cn0KZ2VuYmFua19zZWxlY3QgPC0gZ2VuYmFuayAlPiUKICBzZWxlY3QoJ3RheGlkJywgJ3NwZWNpZXNfdGF4aWQnLCAnb3JnYW5pc21fbmFtZScsICdnZW5vbWVfcmVwJywgJ2Z0cF9wYXRoJykKCmdlbmJhbmtfc2VsZWN0CmBgYAoKIyBKb2luCgpgYGB7cn0KbW1ldHNwX3RheG9uCmBgYAoKYGBge3J9CmdlbmJhbmtfc2VsZWN0IDwtIGdlbmJhbmtfc2VsZWN0ICU+JQogIG11dGF0ZSh0YXhpZCA9IGFzLmNoYXJhY3Rlcih0YXhpZCksCiAgICAgICAgIHNwZWNpZXNfdGF4aWQgPSBhcy5jaGFyYWN0ZXIoc3BlY2llc190YXhpZCkpCmdlbmJhbmtfc2VsZWN0CmBgYApgYGB7cn0KZ2VuYmFuawpgYGAKCmBgYHtyfQpzdWZmaXggPC0gIl9nZW5vbWljLmZuYS5neiIKCmpvaW5fdGF4aWQgPC0gaW5uZXJfam9pbihtbWV0c3BfdGF4b24sIGdlbmJhbmssIGJ5ID0gYygndGF4b25faWQnID0gJ3RheGlkJykpICU+JQogIGRpc3RpbmN0KHNhbXBsZV9pZCwgLmtlZXBfYWxsID0gVFJVRSkgJT4lCiAgZGlzdGluY3QodGF4b25faWQsIC5rZWVwX2FsbCA9IFRSVUUpICU+JQogIHJvd3dpc2UoKSAlPiUKICBtdXRhdGUoCiAgICBnZW5vbWVfZmlsZW5hbWUgPSBwYXN0ZSh0YWlsKHN0cl9zcGxpdChmdHBfcGF0aCwgJy8nKVtbMV1dLCAxKSwgc3VmZml4LCBzZXAgPSAiIiksCiAgICAKICAgIGdlbm9tZV9mdHBfcGF0aCA9IHBhc3RlKGZ0cF9wYXRoLCBnZW5vbWVfZmlsZW5hbWUsIHNlcCA9ICIvIikKICApCiAgCmpvaW5fdGF4aWQKYGBgCmBgYHtyfQp3cml0ZV9jc3Yoam9pbl90YXhpZCwgJy4uL2RhdGEvbW1ldHNwX25jYmlfZ2Vub21lX2luZm8uY3N2JykKYGBgCgoK